#!/usr/bin/env perl

#
#  Copyright (c) 2008, detrox@gmail.com
#  All rights reserved.
#
#  Redistribution and use in source and binary forms, with or without
#  modification, are permitted provided that the following conditions are met:
#      * Redistributions of source code must retain the above copyright
#        notice, this list of conditions and the following disclaimer.
#      * Redistributions in binary form must reproduce the above copyright
#        notice, this list of conditions and the following disclaimer in the
#        documentation and/or other materials provided with the distribution.
#      * Neither the name of the <organization> nor the
#        names of its contributors may be used to endorse or promote products
#        derived from this software without specific prior written permission.
#
#  THIS SOFTWARE IS PROVIDED BY detrox@gmail.com ''AS IS'' AND ANY
#  EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
#  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
#  DISCLAIMED. IN NO EVENT SHALL detrox@gmail.com BE LIABLE FOR ANY
#  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
#  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
#  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
#  ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
#  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#

use strict;
use utf8;
use Encode qw/encode decode/;
use Getopt::Std;

use vars '$opt_v', '$opt_h';

################################################################################
#
# Generate Character Sequence
#
################################################################################
# Print the character-level tagging of one word to STDOUT.
#
# Arguments:
#   $word - a single word; it is split into characters, so it is expected to
#           be a character string (the output is UTF-8 encoded here).
#
# Output: one UTF-8 encoded "<char> <tag>" line per character:
#   the only character of a one-character word -> S
#   first / second / third character           -> B / B2 / B3
#   any further inner character                -> M
#   last character of a longer word            -> E
# An empty word prints nothing.
sub generate
{
	# Copy the argument into a lexical instead of assigning to the global $_,
	# so the caller's $_ is left untouched.
	my ($word) = @_;
	my @chars = split //, $word;
	my @tags = qw/B B2 B3 M/;
	my $last = $#chars;

	foreach my $index (0 .. $last) {
		# Positions past the end of @tags all share the final tag (M).
		my $tag = $last == 0      ? 'S'
		        : $index == $last ? 'E'
		        : $index < $#tags ? $tags[$index]
		        :                   $tags[-1];
		print encode("UTF-8", "$chars[$index] $tag\n");
	}
	return;
}

################################################################################
#
# Build Training Set
#
################################################################################
# Convert a whitespace-segmented UTF-8 corpus into CRF training data on STDOUT.
#
# Arguments:
#   $file - path of the corpus file; every line is read as a sequence of
#           whitespace-separated words.
#
# Output: each word is handed to generate(), which prints its per-character
# tag lines. A blank line (sequence separator) is printed after every token
# that consists only of sentence-ending punctuation, and at the end of every
# input line that did not already end with such a token.
# When $opt_v is set, progress is reported on STDERR.
#
# Dies if the corpus file cannot be opened.
sub build_crf_training_set
{
	my ($file) = @_;
	my $count = 0;

	# Three-argument open with a lexical handle: the file name can no longer
	# be interpreted as an open mode, and $! tells the user why it failed.
	open my $fh, '<', $file or die "can not open $file: $!";
	print STDERR "Building CRF Traning Set from $file: \n" if ($opt_v);

	while (my $line = <$fh>) {
		# Decode before trimming so that Unicode whitespace (e.g. U+3000) at
		# the edges is removed too and split() yields no empty first token.
		$line = decode("UTF-8", $line);
		$line =~ s/^\s+|\s+$//g;

		my $sentence_closed = 0;
		foreach my $token (split(/\s+/, $line)) {
			generate($token);

			# A token made up solely of sentence-ending punctuation closes the
			# current sequence. The match is anchored so that tokens merely
			# containing a dot (numbers, abbreviations) do not split one.
			$sentence_closed = ($token =~ /\A[。？！.?!]+\z/) ? 1 : 0;
			print "\n" if $sentence_closed;

			$count++;
			print STDERR "\r\t\t $count items generated." if ($opt_v && $count % 500 == 0);
		}

		# End of an input line also ends the sequence; skip the separator if
		# the last token already emitted one.
		print "\n" unless $sentence_closed;
	}
	print STDERR "\r\t\t $count items generated.\n" if ($opt_v);
	close $fh;
	return;
}

################################################################################
#
# main
#
################################################################################


# Command-line entry point: parse the options, show the usage text when help
# is requested or the invocation is invalid, otherwise convert the corpus
# named by the first remaining argument.
my $usage = <<'END_USAGE';
Usage: crf_tool [OPTIONS] CORPUS
Convert Corpus to CRF Training Set
OPTIONS:
        -v            verbose
        -h            show this help

Author: jianingy <detrox@gmail.com>
END_USAGE

my $options_ok = getopts("vh");
my $corpus = $ARGV[0];

# Test for presence rather than truth so that a corpus file named "0" works.
my $have_corpus = defined $corpus && length $corpus;

if (!$options_ok || $opt_h || !$have_corpus) {
	# Explicit -h is a successful run on STDOUT; anything else is a usage
	# error and is reported on STDERR with a non-zero exit status.
	my $misuse = !$opt_h;
	print { $misuse ? \*STDERR : \*STDOUT } $usage;
	exit($misuse ? 1 : 0);
}

build_crf_training_set($corpus);
